import pandas as pd

from spider.SingleArticleSpider import SingleArticleSpider

# Read the URL list. pandas already uses the CSV's first line as the header,
# so no manual header handling is needed (the old iloc[0] "header" grab was
# actually duplicating the first data row).
df = pd.read_csv('./csv_collect/big_articles.csv')

# Resume position: rows before index n were crawled in a previous run.
n = 5594
df_slice = df.iloc[n:]

# Total is the number of rows actually left to crawl.
total = len(df_slice)
deepSpider = SingleArticleSpider()

try:
    # Iterate the remaining URLs positionally; enumerate gives a 1-based
    # progress counter regardless of what the CSV's index labels look like.
    for count, (row_idx, url) in enumerate(zip(df_slice.index, df_slice["url"]), start=1):
        print(row_idx)
        print(url)
        print("当前解析位置：第 " + str(count) + "/" + str(total) + " 条")
        if url == "NOTFOUND":
            # Placeholder rows (ads / bad data) are skipped, not crawled.
            print("广告链接或错误数据：NOTFOUND跳过")
        else:
            deepSpider.run(url)
finally:
    # Always close the Selenium driver, even if a crawl raises mid-loop,
    # so we never leak a browser process.
    print("--------深度爬取细粒度文章成功------正在关闭selenium dirver----------")
    deepSpider.destroy()

print("--------完结，撒花----------")
